DGN-WB Elastic Net 10-fold CV Results for gene expression prediction (SNPs within 1Mb of each gene)

Compare alpha=0.5 to alpha=1

library(ggplot2)
library(reshape2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
alpha0.5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha0.5_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
dim(alpha1)
## [1] 13171     8
dim(alpha0.5)
## [1] 13171     8
en <- inner_join(alpha1,alpha0.5,by="gene")
with(en,plot(R2.x,R2.y,xlab='alpha=1 R2',ylab='alpha=0.5 R2',xlim=c(0,1),ylim=c(0,1),main='E-N DGN-WB all 13K protein coding genes',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(en,t.test(R2.x>R2.y))
## 
##  One Sample t-test
## 
## data:  R2.x > R2.y
## t = 111.7318, df = 11232, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.5171606 0.5356303
## sample estimates:
## mean of x 
## 0.5263954

Compare polyscore thresholds

"%&%" = function(a,b) paste(a,b,sep="")
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps3 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.001_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps2 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.01_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps05 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.05_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.5_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps4 <- ps4 %>% select(gene,CV.R2)
ps3 <- ps3 %>% select(gene,CV.R2)
ps2 <- ps2 %>% select(gene,CV.R2)
ps05 <- ps05 %>% select(gene,CV.R2)
ps5 <- ps5 %>% select(gene,CV.R2)
ps1 <- ps1 %>% select(gene,CV.R2)
all <- inner_join(ps4,ps3,by='gene')
colnames(all) <- c('gene',0.0001,0.001)
all <- inner_join(all,ps2,by='gene')
colnames(all) <- c('gene',0.0001,0.001,0.01)
all <- inner_join(all,ps05,by='gene')
colnames(all) <- c('gene',0.0001,0.001,0.01,0.05)
all <- inner_join(all,ps5,by='gene')
colnames(all) <- c('gene',0.0001,0.001,0.01,0.05,0.5)
all <- inner_join(all,ps1,by='gene')
colnames(all) <- c('gene',0.0001,0.001,0.01,0.05,0.5,1)
all <- all[complete.cases(all),]
all_long <- melt(all,by=gene)
## Using gene as id variables
ngenes<-dim(all)[1]
p <- ggplot(all_long, aes(x = -log10(as.numeric(levels(variable))[variable]), y = value), group=gene) + geom_line(lwd = 0.5, show_guide = FALSE) + aes(color = gene) + xlab("-log10(p) threshold") + ylab("R2") + ggtitle("Polyscore DGN-WB (" %&% ngenes %&% " protein coding genes)")
print(p)

p <- ggplot(all_long, aes(x = -log10(as.numeric(levels(variable))[variable]), y = value), group=gene) + geom_line(lwd = 0.5, show_guide = FALSE) + aes(color = gene) + xlab("-log10(p) threshold") + ylab("R2") + ggtitle("Polyscore DGN-WB (protein coding genes with R2>0.5)") + coord_cartesian(ylim=c(0.5,0.9))
print(p)

Compare polyscore to E-N

"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps05 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.05_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
all <- inner_join(alpha1,ps4,by='gene')
all <- inner_join(all,ps05,by='gene')
with(all,plot(R2,CV.R2.x,xlab='LASSO (alpha=1) R2',ylab='Polyscore P < 0.0001 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(R2>CV.R2.x))
## 
##  One Sample t-test
## 
## data:  R2 > CV.R2.x
## t = 235.2103, df = 10428, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.8343917 0.8484159
## sample estimates:
## mean of x 
## 0.8414038
with(all,plot(R2,CV.R2.y,xlab='LASSO (alpha=1) R2',ylab='Polyscore P < 0.05 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(R2>CV.R2.y))
## 
##  One Sample t-test
## 
## data:  R2 > CV.R2.y
## t = 213.8756, df = 11551, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.7910726 0.8057071
## sample estimates:
## mean of x 
## 0.7983899
with(all,plot(CV.R2.x,CV.R2.y,xlab='Polyscore P < 0.0001 R2',ylab='Polyscore P < 0.05 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(CV.R2.x>CV.R2.y))
## 
##  One Sample t-test
## 
## data:  CV.R2.x > CV.R2.y
## t = 128.6931, df = 10493, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.6028352 0.6214834
## sample estimates:
## mean of x 
## 0.6121593
ps2 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.01_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
all <- inner_join(all,ps2,by='gene')
with(all,plot(CV.R2.x,CV.R2,xlab='Polyscore P < 0.0001 R2',ylab='Polyscore P < 0.01 R2',xlim=c(0,1),ylim=c(0,1),main='DGN-WB predictive performance',cex=0.5,col=rgb(100,100,100,100,maxColorValue=255)))
abline(0,1,col="red")

with(all,t.test(CV.R2.x>CV.R2))
## 
##  One Sample t-test
## 
## data:  CV.R2.x > CV.R2
## t = 113.1659, df = 10493, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.5401268 0.5591681
## sample estimates:
## mean of x 
## 0.5496474

Compare n.snps per model

alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
alpha0.5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha0.5_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps05 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.05_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
alpha1 <- alpha1 %>% select(gene,n.snps)
alpha0.5 <- alpha0.5 %>% select(gene,n.snps)
ps4 <- ps4 %>% select(gene,n.snps)
ps05 <- ps05 %>% select(gene,n.snps)
all <- inner_join(alpha1,alpha0.5,by='gene')
all <- inner_join(all,ps4,by='gene')
all <- inner_join(all,ps05,by='gene')
colnames(all) <- c('gene','EN.a=1','EN.a=0.5','PS.e-04','PS.0.05')
all_long <- melt(all,by=gene)
## Using gene as id variables
p <- ggplot(all_long, aes(x = variable, y = value))
p + geom_boxplot() + ggtitle("Number of SNPs in predictive model") + xlab("model") + ylab("n SNPs")
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

p <- ggplot(all_long, aes(x = variable, y = value))
p + geom_boxplot() + ggtitle("Number of SNPs in predictive model") + xlab("model") + ylab("n SNPs") + coord_cartesian(ylim=c(0,250))
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Compare LASSO R2 to gcta local h2

"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
all <- inner_join(alpha1,h2,by='gene')
data <- all %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se) ) %>% arrange(h2)
data <- data[complete.cases(data),]
p1<-ggplot(data,aes(x=1:nrow(data),y=h2,ymin=ymin, ymax=ymax) ) + geom_pointrange(col='gray')+geom_point()
p1 + geom_point(data=data,aes(x=1:nrow(data),y=R2),color='red',cex=0.5) + xlab("Genes sorted by h2") + ylab("h2 (black) or R2 (red)") + ggtitle('DGN-WB Local h2 (black) and LASSO 10-fold CV R2 (red)')

##calc %genes with R2 that reaches lower bound of h2 estimate
count<-data$R2>data$ymin
table(count)
## count
## FALSE  TRUE 
##   609 10943
sum(table(count))
## [1] 11552
table(count)/sum(table(count))
## count
##      FALSE       TRUE 
## 0.05271814 0.94728186

Compare polyscore R2 to gcta local h2

"%&%" = function(a,b) paste(a,b,sep="")
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
all <- inner_join(ps4,h2,by='gene')
data <- all %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se) ) %>% arrange(h2)
data <- data[complete.cases(data),]
p1<-ggplot(data,aes(x=1:nrow(data),y=h2,ymin=ymin, ymax=ymax) ) + geom_pointrange(col='gray')+geom_point()
p1 + geom_point(data=data,aes(x=1:nrow(data),y=CV.R2),color='red',cex=0.5) + xlab("Genes sorted by h2") + ylab("h2 (black) or R2 (red)") + ggtitle('DGN-WB Local h2 (black) and Polyscore (P<e-04) 10-fold CV R2 (red)')

##calc %genes with R2 that reaches lower bound of h2 estimate
count<-data$CV.R2>data$ymin
table(count)
## count
## FALSE  TRUE 
##  2510  7984
sum(table(count))
## [1] 10494
table(count)/sum(table(count))
## count
##     FALSE      TRUE 
## 0.2391843 0.7608157
##calc prop genes with h2>0.1
a<-h2$h2>0.1
table(a)/sum(table(a))
## a
##     FALSE      TRUE 
## 0.6281506 0.3718494
##calc prop genes with h2>0.01
a<-h2$h2>0.01
table(a)/sum(table(a))
## a
##     FALSE      TRUE 
## 0.1851553 0.8148447

Compare gcta local h2 to mean expression level

h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
exp <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.mean.median.expression.2015-02-23.txt',header=T)
all <- inner_join(h2,exp,by='gene')
dim(all)
## [1] 13648     6
ggplot(all,aes(x=h2,y=expmeans)) + geom_point() + geom_smooth(method = "lm") + xlab("Local h2") + ylab("Mean expression") + ggtitle("DGN-WB (each point is a gene)")

summary(lm(expmeans~h2,all))
## 
## Call:
## lm(formula = expmeans ~ h2, data = all)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.117e-03 -1.898e-04 -1.300e-07  1.889e-04  1.095e-03 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.225e-05  3.114e-06  -3.935 8.37e-05 ***
## h2           1.798e-05  1.565e-05   1.149    0.251    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002864 on 13646 degrees of freedom
## Multiple R-squared:  9.673e-05,  Adjusted R-squared:  2.345e-05 
## F-statistic:  1.32 on 1 and 13646 DF,  p-value: 0.2506
##No relationship between mean expression and h2
ggplot(all,aes(x=h2,y=expmedians)) + geom_point() + geom_smooth(method = "lm") + xlab("Local h2") + ylab("Median expression") + ggtitle("DGN-WB (each point is a gene)")

summary(lm(expmedians~h2,all))
## 
## Call:
## lm(formula = expmedians ~ h2, data = all)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.55834 -0.02875  0.00293  0.03249  0.54593 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0307528  0.0006562  46.865  < 2e-16 ***
## h2          -0.0158342  0.0032970  -4.803 1.58e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06034 on 13646 degrees of freedom
## Multiple R-squared:  0.001687,   Adjusted R-squared:  0.001614 
## F-statistic: 23.06 on 1 and 13646 DF,  p-value: 1.583e-06
##Lower median expression has a larger h2
toph2 <- all[all[,2]>0.1,]
dim(toph2)
## [1] 5075    6
ggplot(toph2,aes(x=h2,y=expmedians)) + geom_point() + geom_smooth(method = "lm") + xlab("Local h2") + ylab("Median expression") + ggtitle("DGN-WB h2>0.1 (each point is a gene)")

summary(lm(expmedians~h2,toph2))
## 
## Call:
## lm(formula = expmedians ~ h2, data = toph2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51324 -0.03158  0.00332  0.03658  0.49316 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.031877   0.002092  15.239  < 2e-16 ***
## h2          -0.018384   0.006507  -2.825  0.00474 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.07698 on 5073 degrees of freedom
## Multiple R-squared:  0.001571,   Adjusted R-squared:  0.001374 
## F-statistic: 7.982 on 1 and 5073 DF,  p-value: 0.004742
##Lower median expression has a larger h2

Compare polyscore thresholds

"%&%" = function(a,b) paste(a,b,sep="")
library(GGally)
## Warning: package 'GGally' was built under R version 3.1.2
## 
## Attaching package: 'GGally'
## 
## The following object is masked from 'package:dplyr':
## 
##     nasa
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
alpha0.5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha0.5_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
topsnp <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_topSNP_imputedSNPs_chr1-22_2015-03-13.txt',header=T)
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps3 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.001_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps2 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.01_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps05 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.05_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps5 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh0.5_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
ps1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
alpha1 <- alpha1 %>% select(gene,R2)
alpha0.5<-alpha0.5 %>% select(gene,R2)
topsnp <- topsnp %>% select(gene,R2)
ps4 <- ps4 %>% select(gene,CV.R2)
ps3 <- ps3 %>% select(gene,CV.R2)
ps2 <- ps2 %>% select(gene,CV.R2)
ps05 <- ps05 %>% select(gene,CV.R2)
ps5 <- ps5 %>% select(gene,CV.R2)
ps1 <- ps1 %>% select(gene,CV.R2)
all <- inner_join(alpha1,alpha0.5,by='gene')
colnames(all) <- c('gene','lasso','eNet')
all <- inner_join(all,topsnp,by='gene')
colnames(all) <- c('gene','lasso','eNet','topSNP')
all <- inner_join(all,ps4,by='gene')
colnames(all) <- c('gene','lasso','eNet','topSNP','poly0.0001')
all <- inner_join(all,ps3,by='gene')
colnames(all) <- c('gene','lasso','eNet','topSNP','poly0.0001','poly0.001')
all <- inner_join(all,ps2,by='gene')
colnames(all) <- c('gene','lasso','eNet','topSNP','poly0.0001','poly0.001','poly0.01')
all <- inner_join(all,ps05,by='gene')
colnames(all) <- c('gene','lasso','eNet','topSNP','poly0.0001','poly0.001','poly0.01','poly0.05')
all <- inner_join(all,ps5,by='gene')
colnames(all) <- c('gene','lasso','eNet','topSNP','poly0.0001','poly0.001','poly0.01','poly0.05','poly0.5')
all <- inner_join(all,ps1,by='gene')
colnames(all) <- c('gene','lasso','eNet','topSNP','poly0.0001','poly0.001','poly0.01','poly0.05','poly0.5','poly1')
all <- all[complete.cases(all),]
p<-ggpairs(all[,2:10],lower=list(continuous="points",params=c(cex=0.7,ylim=c(0,1),xlim=c(0,1))),diag=list(continuous='blank'),title="Predictive Performance (R2 of GReX vs. observed expression)")
tiff(filename="ggpairs_DGN-WB_10-fCV_with_topSNP.tiff",width=1100,height=1100)
print(p)
dev.off()
## pdf 
##   2
png(filename="ggpairs_DGN-WB_10-fCV_with_topSNP.png",width=1100,height=1100)
print(p)
dev.off()
## pdf 
##   2
print(p)

Compare LASSO and Polyscore R2 to gcta local h2

"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
all <- inner_join(alpha1,h2,by='gene')
all <- inner_join(all,ps4,by='gene')
data <- all %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se) ) %>% arrange(h2)
data <- data[complete.cases(data),]
p1<-ggplot(data,aes(x=1:nrow(data),y=h2,ymin=ymin, ymax=ymax) ) + geom_pointrange(col='gray')+geom_point()
p1<-p1 + geom_point(data=data,aes(x=1:nrow(data),y=R2),color='red',cex=0.5) + xlab("Genes sorted by h2") + ylab("h2 (black) or R2 (red)") + ggtitle('LASSO predictive performance (R2) compared to local heritability (h2)')
p2<-ggplot(data,aes(x=1:nrow(data),y=h2,ymin=ymin, ymax=ymax) ) + geom_pointrange(col='gray')+geom_point()
p2<-p2 + geom_point(data=data,aes(x=1:nrow(data),y=CV.R2),color='red',cex=0.5) + xlab("Genes sorted by h2") + ylab("h2 (black) or R2 (red)") + ggtitle('Polygenic score (P<1e-04) predictive performance (R2) compared to local heritability (h2)')
library(grid)
pushViewport(viewport(layout = grid.layout(2, 1)))
print(p1, vp = viewport(layout.pos.row = 1, layout.pos.col = 1))
print(p2, vp = viewport(layout.pos.row = 2, layout.pos.col = 1))

Compare LASSO and Polyscore R2 to gcta local h2

"%&%" = function(a,b) paste(a,b,sep="")
alpha1 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_elasticNet_alpha1_imputedSNPs_chr1-22_2015-02-02.txt', header=T)
h2 <- read.table('/Users/heather/Dropbox/cis.v.trans.prediction/DGN-WB.localGRM.h2.exp.2014-08-30.txt',header=T)
ps4 <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_Pthresh1e-04_imputedSNPs_chr1-22_2015-02-09.txt',header=T)
topsnp <- read.table('/Users/heather/Dropbox/elasticNet_testing/DGN-imputed-CV_20150209/DGN-WB_exp_10-foldCV_polyscore_topSNP_imputedSNPs_chr1-22_2015-03-13.txt',header=T)
alpha1h2 <- inner_join(alpha1,h2,by='gene')
dataL <- alpha1h2 %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se)) %>% arrange(h2) %>% select(gene,R2,h2,ymin,ymax)
dataL <- dataL %>% mutate(method="LASSO",position=1:nrow(dataL))
ps4h2 <- inner_join(ps4,h2,by='gene')
dataP <- ps4h2 %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se)) %>% arrange(h2) %>% select(gene,CV.R2,h2,ymin,ymax)
dataP <- dataP %>% mutate(method="Polyscore",position=1:nrow(dataP))
colnames(dataP) <- c('gene','R2','h2','ymin','ymax','method','position')
toph2 <- inner_join(topsnp,h2,by='gene')
dataT <- toph2 %>% mutate(ymin = pmax(0, h2 - 2 * se), ymax = pmin(1, h2 + 2 * se)) %>% arrange(h2) %>% select(gene,R2,h2,ymin,ymax)
dataT <- dataT %>% mutate(method="Top SNP",position=1:nrow(dataT))
colnames(dataT) <- c('gene','R2','h2','ymin','ymax','method','position')
data<-rbind(dataL,dataP,dataT)
data <- data[complete.cases(data),]
p1<-ggplot(data,aes(x=position,y=h2,ymin=ymin, ymax=ymax) ) + geom_pointrange(col='gray')+geom_point()+facet_wrap(~method,ncol=1)
p1<- p1 + geom_point(data=data,aes(x=position,y=R2),color='red',cex=0.8) + xlab(expression("Genes sorted by h"^2)) + ylab(expression(h^{2} ~"(black) or " ~ R^{2} ~ "(red)")) +theme_bw(30)
png(filename="compareR2_h2_lasso_poly_top.png",width=720,height=960)
print(p1)
dev.off()
## pdf 
##   2
tiff(filename="compareR2_h2_lasso_poly_top.tiff",width=720,height=960)
print(p1)
dev.off()
## pdf 
##   2
p1

##calc %genes with R2 that reaches lower bound of h2 estimate
###LASSO
count<-dataL$R2>dataL$ymin
table(count)
## count
## FALSE  TRUE 
##   609 10943
sum(table(count))
## [1] 11552
table(count)/sum(table(count))
## count
##      FALSE       TRUE 
## 0.05271814 0.94728186
###Polyscore
count<-dataP$R2>dataP$ymin
table(count)
## count
## FALSE  TRUE 
##  2510  7984
sum(table(count))
## [1] 10494
table(count)/sum(table(count))
## count
##     FALSE      TRUE 
## 0.2391843 0.7608157
###TopSNP
count<-dataT$R2>dataT$ymin
table(count)
## count
## FALSE  TRUE 
##  2081 11088
sum(table(count))
## [1] 13169
table(count)/sum(table(count))
## count
##     FALSE      TRUE 
## 0.1580226 0.8419774
##calc %genes with R2 that reaches lower bound of h2 estimate, only include genes with h2>0.1
###LASSO
topdataL<-dataL %>% filter(h2>0.1)
count<-topdataL$R2>topdataL$ymin
table(count)
## count
## FALSE  TRUE 
##   546  4340
sum(table(count))
## [1] 4886
table(count)/sum(table(count))
## count
##     FALSE      TRUE 
## 0.1117479 0.8882521
###Polyscore
topdataP<-dataP %>% filter(h2>0.1)
count<-topdataP$R2>topdataP$ymin
table(count)
## count
## FALSE  TRUE 
##  2141  2744
sum(table(count))
## [1] 4885
table(count)/sum(table(count))
## count
##     FALSE      TRUE 
## 0.4382805 0.5617195
###TopSNP
topdataT<-dataT %>% filter(h2>0.1)
count<-topdataT$R2>topdataT$ymin
table(count)
## count
## FALSE  TRUE 
##  1755  3131
sum(table(count))
## [1] 4886
table(count)/sum(table(count))
## count
##     FALSE      TRUE 
## 0.3591895 0.6408105